##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.1 ✔ readr 2.1.6
## ✔ ggplot2 4.0.1 ✔ stringr 1.6.0
## ✔ lubridate 1.9.4 ✔ tibble 3.3.0
## ✔ purrr 1.2.0 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
##
## Attaching package: 'janitor'
##
##
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
##
##
##
## Attaching package: 'scales'
##
##
## The following object is masked from 'package:purrr':
##
## discard
##
##
## The following object is masked from 'package:readr':
##
## col_factor
##
##
## Loading required package: lattice
##
##
## Attaching package: 'caret'
##
##
## The following object is masked from 'package:purrr':
##
## lift
##
##
##
## Attaching package: 'MASS'
##
##
## The following object is masked from 'package:dplyr':
##
## select
##
##
## Loading required package: Matrix
##
##
## Attaching package: 'Matrix'
##
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
##
## Loaded glmnet 4.1-10
##
## Loading required package: mvtnorm
##
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
df <- readr::read_tsv("data/marketing_campaign.csv") %>%
janitor::clean_names()
## Rows: 2240 Columns: 29
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): Education, Marital_Status, Dt_Customer
## dbl (26): ID, Year_Birth, Income, Kidhome, Teenhome, Recency, MntWines, MntF...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Content
AcceptedCmp1 - 1 if customer accepted the offer in the 1st campaign, 0 otherwise AcceptedCmp2 - 1 if customer accepted the offer in the 2nd campaign, 0 otherwise AcceptedCmp3 - 1 if customer accepted the offer in the 3rd campaign, 0 otherwise AcceptedCmp4 - 1 if customer accepted the offer in the 4th campaign, 0 otherwise AcceptedCmp5 - 1 if customer accepted the offer in the 5th campaign, 0 otherwise Response (target) - 1 if customer accepted the offer in the last campaign, 0 otherwise Complain - 1 if customer complained in the last 2 years DtCustomer - date of customer’s enrolment with the company Education - customer’s level of education Marital - customer’s marital status Kidhome - number of small children in customer’s household Teenhome - number of teenagers in customer’s household Income - customer’s yearly household income MntFishProducts - amount spent on fish products in the last 2 years MntMeatProducts - amount spent on meat products in the last 2 years MntFruits - amount spent on fruits products in the last 2 years MntSweetProducts - amount spent on sweet products in the last 2 years MntWines - amount spent on wine products in the last 2 years MntGoldProds - amount spent on gold products in the last 2 years NumDealsPurchases - number of purchases made with discount NumCatalogPurchases - number of purchases made using catalogue NumStorePurchases - number of purchases made directly in stores NumWebPurchases - number of purchases made through company’s web site NumWebVisitsMonth - number of visits to company’s web site in the last month Recency - number of days since the last purchase
glimpse(df)
## Rows: 2,240
## Columns: 29
## $ id <dbl> 5524, 2174, 4141, 6182, 5324, 7446, 965, 6177, 4…
## $ year_birth <dbl> 1957, 1954, 1965, 1984, 1981, 1967, 1971, 1985, …
## $ education <chr> "Graduation", "Graduation", "Graduation", "Gradu…
## $ marital_status <chr> "Single", "Single", "Together", "Together", "Mar…
## $ income <dbl> 58138, 46344, 71613, 26646, 58293, 62513, 55635,…
## $ kidhome <dbl> 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, …
## $ teenhome <dbl> 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, …
## $ dt_customer <chr> "04-09-2012", "08-03-2014", "21-08-2013", "10-02…
## $ recency <dbl> 58, 38, 26, 26, 94, 16, 34, 32, 19, 68, 11, 59, …
## $ mnt_wines <dbl> 635, 11, 426, 11, 173, 520, 235, 76, 14, 28, 5, …
## $ mnt_fruits <dbl> 88, 1, 49, 4, 43, 42, 65, 10, 0, 0, 5, 16, 61, 2…
## $ mnt_meat_products <dbl> 546, 6, 127, 20, 118, 98, 164, 56, 24, 6, 6, 11,…
## $ mnt_fish_products <dbl> 172, 2, 111, 10, 46, 0, 50, 3, 3, 1, 0, 11, 225,…
## $ mnt_sweet_products <dbl> 88, 1, 21, 3, 27, 42, 49, 1, 3, 1, 2, 1, 112, 5,…
## $ mnt_gold_prods <dbl> 88, 6, 42, 5, 15, 14, 27, 23, 2, 13, 1, 16, 30, …
## $ num_deals_purchases <dbl> 3, 2, 1, 2, 5, 2, 4, 2, 1, 1, 1, 1, 1, 3, 1, 1, …
## $ num_web_purchases <dbl> 8, 1, 8, 2, 5, 6, 7, 4, 3, 1, 1, 2, 3, 6, 1, 7, …
## $ num_catalog_purchases <dbl> 10, 1, 2, 0, 3, 4, 3, 0, 0, 0, 0, 0, 4, 1, 0, 6,…
## $ num_store_purchases <dbl> 4, 2, 10, 4, 6, 10, 7, 4, 2, 0, 2, 3, 8, 5, 3, 1…
## $ num_web_visits_month <dbl> 7, 5, 4, 6, 5, 6, 6, 8, 9, 20, 7, 8, 2, 6, 8, 3,…
## $ accepted_cmp3 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ accepted_cmp4 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ accepted_cmp5 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ accepted_cmp1 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ accepted_cmp2 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ complain <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ z_cost_contact <dbl> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, …
## $ z_revenue <dbl> 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, …
## $ response <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, …
dim(df)
## [1] 2240 29
Only the “Income” variable has zero values. Since there are only 24 missing values, it was decided to eliminate them directly.
skimr::skim(df)
| Name | df |
| Number of rows | 2240 |
| Number of columns | 29 |
| _______________________ | |
| Column type frequency: | |
| character | 3 |
| numeric | 26 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| education | 0 | 1 | 3 | 10 | 0 | 5 | 0 |
| marital_status | 0 | 1 | 4 | 8 | 0 | 8 | 0 |
| dt_customer | 0 | 1 | 10 | 10 | 0 | 663 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| id | 0 | 1.00 | 5592.16 | 3246.66 | 0 | 2828.25 | 5458.5 | 8427.75 | 11191 | ▇▇▇▇▇ |
| year_birth | 0 | 1.00 | 1968.81 | 11.98 | 1893 | 1959.00 | 1970.0 | 1977.00 | 1996 | ▁▁▂▇▅ |
| income | 24 | 0.99 | 52247.25 | 25173.08 | 1730 | 35303.00 | 51381.5 | 68522.00 | 666666 | ▇▁▁▁▁ |
| kidhome | 0 | 1.00 | 0.44 | 0.54 | 0 | 0.00 | 0.0 | 1.00 | 2 | ▇▁▆▁▁ |
| teenhome | 0 | 1.00 | 0.51 | 0.54 | 0 | 0.00 | 0.0 | 1.00 | 2 | ▇▁▇▁▁ |
| recency | 0 | 1.00 | 49.11 | 28.96 | 0 | 24.00 | 49.0 | 74.00 | 99 | ▇▇▇▇▇ |
| mnt_wines | 0 | 1.00 | 303.94 | 336.60 | 0 | 23.75 | 173.5 | 504.25 | 1493 | ▇▂▂▁▁ |
| mnt_fruits | 0 | 1.00 | 26.30 | 39.77 | 0 | 1.00 | 8.0 | 33.00 | 199 | ▇▁▁▁▁ |
| mnt_meat_products | 0 | 1.00 | 166.95 | 225.72 | 0 | 16.00 | 67.0 | 232.00 | 1725 | ▇▁▁▁▁ |
| mnt_fish_products | 0 | 1.00 | 37.53 | 54.63 | 0 | 3.00 | 12.0 | 50.00 | 259 | ▇▁▁▁▁ |
| mnt_sweet_products | 0 | 1.00 | 27.06 | 41.28 | 0 | 1.00 | 8.0 | 33.00 | 263 | ▇▁▁▁▁ |
| mnt_gold_prods | 0 | 1.00 | 44.02 | 52.17 | 0 | 9.00 | 24.0 | 56.00 | 362 | ▇▁▁▁▁ |
| num_deals_purchases | 0 | 1.00 | 2.33 | 1.93 | 0 | 1.00 | 2.0 | 3.00 | 15 | ▇▂▁▁▁ |
| num_web_purchases | 0 | 1.00 | 4.08 | 2.78 | 0 | 2.00 | 4.0 | 6.00 | 27 | ▇▃▁▁▁ |
| num_catalog_purchases | 0 | 1.00 | 2.66 | 2.92 | 0 | 0.00 | 2.0 | 4.00 | 28 | ▇▂▁▁▁ |
| num_store_purchases | 0 | 1.00 | 5.79 | 3.25 | 0 | 3.00 | 5.0 | 8.00 | 13 | ▂▇▂▃▂ |
| num_web_visits_month | 0 | 1.00 | 5.32 | 2.43 | 0 | 3.00 | 6.0 | 7.00 | 20 | ▅▇▁▁▁ |
| accepted_cmp3 | 0 | 1.00 | 0.07 | 0.26 | 0 | 0.00 | 0.0 | 0.00 | 1 | ▇▁▁▁▁ |
| accepted_cmp4 | 0 | 1.00 | 0.07 | 0.26 | 0 | 0.00 | 0.0 | 0.00 | 1 | ▇▁▁▁▁ |
| accepted_cmp5 | 0 | 1.00 | 0.07 | 0.26 | 0 | 0.00 | 0.0 | 0.00 | 1 | ▇▁▁▁▁ |
| accepted_cmp1 | 0 | 1.00 | 0.06 | 0.25 | 0 | 0.00 | 0.0 | 0.00 | 1 | ▇▁▁▁▁ |
| accepted_cmp2 | 0 | 1.00 | 0.01 | 0.11 | 0 | 0.00 | 0.0 | 0.00 | 1 | ▇▁▁▁▁ |
| complain | 0 | 1.00 | 0.01 | 0.10 | 0 | 0.00 | 0.0 | 0.00 | 1 | ▇▁▁▁▁ |
| z_cost_contact | 0 | 1.00 | 3.00 | 0.00 | 3 | 3.00 | 3.0 | 3.00 | 3 | ▁▁▇▁▁ |
| z_revenue | 0 | 1.00 | 11.00 | 0.00 | 11 | 11.00 | 11.0 | 11.00 | 11 | ▁▁▇▁▁ |
| response | 0 | 1.00 | 0.15 | 0.36 | 0 | 0.00 | 0.0 | 0.00 | 1 | ▇▁▁▁▂ |
df_clean <- df %>%
mutate(
across(c(
id, education, marital_status,
kidhome, teenhome,
accepted_cmp1, accepted_cmp2, accepted_cmp3,
accepted_cmp4, accepted_cmp5, response
), as.factor)
)
df_clean <- df_clean %>% tidyr::drop_na(income)
sum(is.na(df_clean))
## [1] 0
The variables “z_cost_contact” and “z_revenue” have a constant value, as they do not add any information and are therefore eliminated from the analysis.
df %>%
summarise(across(everything(), n_distinct)) %>%
pivot_longer(everything(), names_to = "var", values_to = "n_unique") %>%
filter(n_unique == 1)
## # A tibble: 2 × 2
## var n_unique
## <chr> <int>
## 1 z_cost_contact 1
## 2 z_revenue 1
df_clean <- df %>% dplyr::select(-z_cost_contact, -z_revenue)
Two variables with almost zero variance were found and therefore eliminated.
nzv <- nearZeroVar(df_clean, saveMetrics = TRUE)
nzv[nzv$zeroVar | nzv$nzv, ]
## freqRatio percentUnique zeroVar nzv
## accepted_cmp2 73.66667 0.08928571 FALSE TRUE
## complain 105.66667 0.08928571 FALSE TRUE
df_clean <- df_clean[, !nzv$zeroVar & !nzv$nzv]
The “Yolo,” “Absurd,” and ‘Alone’ modes of the “marital_status” variable have been removed, as they have a very low presence and distort the analysis.
df_clean <- df_clean %>%
filter(!marital_status %in% c("YOLO", "Absurd")) %>%
mutate(marital_status = fct_recode(marital_status,
"Single" = "Alone"
)) %>%
mutate(marital_status = droplevels(marital_status))
count(df_clean, marital_status)
## # A tibble: 5 × 2
## marital_status n
## <fct> <int>
## 1 Single 483
## 2 Divorced 232
## 3 Married 864
## 4 Together 580
## 5 Widow 77
The variables relating to children and young people in households have been grouped together under a single variable, “Children.”
df_clean <- df_clean %>%
mutate(children = kidhome + teenhome) %>%
dplyr::select(-kidhome, -teenhome)
df_clean <- df_clean %>%
mutate(
total_spent =
mnt_wines + mnt_fruits + mnt_meat_products +
mnt_fish_products + mnt_sweet_products +
mnt_gold_prods
)
df_clean <- df_clean %>%
mutate(age = 2024 - year_birth) %>%
dplyr::select(-year_birth)
A logarithmic transformation was applied to the Income feature to correct for the strong positive asymmetry (right-skewness) typical of income distributions.
df_clean <- df_clean %>%
mutate(income_log = log(income)) %>%
dplyr::select(-income)
df_clean <- df_clean %>%
na.omit(is.na(income_log))
dim(df_clean)
## [1] 2212 25
df_clean %>%
ggplot(aes(x = factor(response))) +
geom_bar(fill = "#4E79A7", color = "white", alpha = 0.9, width = 0.6) +
scale_x_discrete(labels = c("0" = "No", "1" = "Yes")) +
labs(
title = "Distribution Response",
x = "Response",
y = "Count"
) +
theme_minimal() +
theme(
plot.title = element_text(face = "bold", size = 16),
panel.grid.minor = element_blank(),
panel.grid.major.x = element_blank()
)
df_clean %>%
ggplot(aes(x = income_log)) +
geom_histogram(bins = 60, fill = "#4682B4", color = "white", alpha = 0.8) +
labs(
title = "Distribution Income",
x = "Log Income",
y = "Frequency"
) +
theme_minimal() +
theme(
plot.title = element_text(face = "bold", size = 16, color = "#333333"),
axis.title.x = element_text(margin = ggplot2::margin(t = 10)),
axis.title.y = element_text(margin = ggplot2::margin(r = 10)),
panel.grid.minor = element_blank(),
panel.grid.major.x = element_blank()
)
df_clean <- df_clean %>% filter(id != 9432)
df_clean %>%
ggplot(aes(x = income_log)) +
geom_histogram(bins = 60, fill = "#4682B4", color = "white", alpha = 0.8) +
labs(
title = "Distribution Log Income",
x = "Log Income",
y = "Frequency"
) +
theme_minimal() +
theme(
plot.title = element_text(face = "bold", size = 16, color = "#333333"),
axis.title.x = element_text(margin = ggplot2::margin(t = 10)),
axis.title.y = element_text(margin = ggplot2::margin(r = 10)),
panel.grid.minor = element_blank(),
panel.grid.major.x = element_blank()
)
df_clean %>%
ggplot(aes(x = reorder(education, education, function(x) -length(x)))) +
geom_bar(fill = "#4682B4", color = "white", alpha = 0.8, width = 0.6) +
geom_text(stat = "count", aes(label = after_stat(count)), vjust = -0.5, color = "#333333") +
labs(
title = "Level of Education",
x = "Educational Qualification",
y = "Count"
) +
theme_minimal() +
theme(
plot.title = element_text(face = "bold", size = 16),
panel.grid.minor = element_blank(),
panel.grid.major.x = element_blank(),
axis.text.y = element_blank(),
axis.ticks.y = element_blank()
)
total_spent is a variable created to show the total expenditure incurred by the various instances. Its marked asymmetry is due to the underlying asymmetry of the various variables that comprise it.
df_clean %>%
ggplot(aes(x = total_spent)) +
geom_histogram(bins = 40, fill = "#4682B4", color = "white", alpha = 0.8) +
labs(
title = "Distribution Total Spent",
x = "Total Spent",
y = "Frequency"
) +
theme_minimal() +
theme(
plot.title = element_text(face = "bold", size = 16, color = "#333333"),
axis.title.x = element_text(margin = ggplot2::margin(t = 10)),
axis.title.y = element_text(margin = ggplot2::margin(r = 10)),
panel.grid.minor = element_blank(),
panel.grid.major.x = element_blank()
)
df_clean %>%
ggplot(aes(x = factor(children))) +
geom_bar(fill = "#4682B4", color = "white", alpha = 0.8) +
labs(
title = "Distribution Children",
x = "Children",
y = "Counts"
) +
theme_minimal() +
theme(
plot.title = element_text(face = "bold", size = 16, color = "#333333"),
axis.title.x = element_text(margin = ggplot2::margin(t = 10)),
axis.title.y = element_text(margin = ggplot2::margin(r = 10)),
panel.grid.minor = element_blank(),
panel.grid.major.x = element_blank()
)
As can be seen from the graphs, all the “mnt_” variables show strong
asymmetry.
colonne_mnt <- df_clean %>%
dplyr::select(starts_with("mnt")) %>%
names()
lista_grafici <- map(colonne_mnt, function(col_name) {
ggplot(df_clean, aes(x = .data[[col_name]])) +
geom_histogram(bins = 30, fill = "#4682B4", color = "white", alpha = 0.8) +
scale_x_continuous(
labels = label_dollar(prefix = "€ ", big.mark = ".", decimal.mark = ",")
) +
labs(
title = paste("Distribution", col_name),
x = "Amount spent",
y = "Frequency"
) +
theme_minimal() +
theme(
plot.title = element_text(face = "bold", size = 14),
panel.grid.minor = element_blank()
)
})
names(lista_grafici) <- colonne_mnt
walk(lista_grafici, print)
Recency is crucial for calculating the churn rate, indicating the number
of days since the last purchase.
df_clean %>%
ggplot(aes(x = recency)) +
geom_histogram(binwidth = 5, fill = "#4682B4", color = "white", alpha = 0.8) +
scale_x_continuous(breaks = seq(0, 100, 10)) +
labs(
title = "Distribution Recency",
subtitle = "Days since the customer's last purchase",
x = "Days since last purchase",
y = "Number of Customers"
) +
theme_minimal() +
theme(
plot.title = element_text(face = "bold", size = 16, color = "#333333"),
panel.grid.minor = element_blank(),
panel.grid.major.x = element_blank(),
axis.title.x = element_text(margin = ggplot2::margin(t = 10)),
axis.title.y = element_text(margin = ggplot2::margin(r = 10))
)
There is a strong correlation (0.64) between income and spending on wine (mnt_wines). High income is positively linked to purchases in stores (0.63) and from catalogs (0.59). There is a strong negative correlation (-0.61) between income and visits to the website. The correlation (0.74) is between mnt_meat_products and num_catalog_purchases (those who buy a lot of meat tend to do so via catalog). Those who spend on fish also tend to spend on fruit (0.59), meat (0.57), and desserts (0.59).
df_corr <- df_clean %>%
dplyr::select(where(is.numeric)) %>%
dplyr::select(-total_spent)%>%
dplyr::select(-matches("ID|id"))
corr_matrix <- cor(df_corr, use = "complete.obs")
ggcorrplot(corr_matrix,
method = "square",
type = "lower",
lab = FALSE,
colors = c("#B2182B", "white", "#E41A1C"),
title = "Correlation Matrix",
ggtheme = theme_minimal()
) +
theme(
plot.title = element_text(face = "bold", size = 16),
axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),
panel.grid = element_blank()
)
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## ℹ The deprecated feature was likely used in the ggcorrplot package.
## Please report the issue at <https://github.com/kassambara/ggcorrplot/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
classifica_correlazioni <- as.data.frame(corr_matrix) %>%
rownames_to_column(var = "Var1") %>%
pivot_longer(cols = -Var1, names_to = "Var2", values_to = "Correlazione") %>%
filter(Var1 < Var2) %>%
arrange(desc(abs(Correlazione)))
print("Higher correlations")
## [1] "Higher correlations"
head(classifica_correlazioni, 10)
## # A tibble: 10 × 3
## Var1 Var2 Correlazione
## <chr> <chr> <dbl>
## 1 mnt_meat_products num_catalog_purchases 0.735
## 2 mnt_wines num_store_purchases 0.640
## 3 income_log mnt_wines 0.638
## 4 mnt_wines num_catalog_purchases 0.635
## 5 income_log num_store_purchases 0.625
## 6 income_log num_web_visits_month -0.607
## 7 income_log num_catalog_purchases 0.593
## 8 mnt_fish_products mnt_fruits 0.592
## 9 mnt_fish_products mnt_sweet_products 0.586
## 10 mnt_fish_products mnt_meat_products 0.574
df_clean <- df_clean%>%
dplyr::select(-dt_customer)
Mixture of Generalized Hyperbolic Distributions (MixGHD)
vars_clustering <- c("mnt_wines", "mnt_fruits", "mnt_meat_products",
"mnt_fish_products", "mnt_sweet_products", "mnt_gold_prods")
X <- df_clean %>%
dplyr::select(all_of(vars_clustering)) %>%
scale()
set.seed(1926)
mod_mghd <- MGHD(data = X, G = 2:5, modelSel = "BIC", scale = FALSE) #sceglie tra 2 e 5 cluster
## The best model (BIC) for the range of components used is G = 4.
## The BIC for this model is -7830.217.
df_clean$cluster <- factor(mod_mghd@map)
summary(mod_mghd)
## The number components used for the model is G = 4.
## BIC = -7830.217. AIC = -7151.774. AIC3 = -7270.774. ICL = -8215.909.
## Cluster N. of elements
## 1 1 311
## 2 2 857
## 3 3 537
## 4 4 506
plot(mod_mghd)
pca_res <- prcomp(X, scale. = TRUE)
num_cluster <- length(unique(mod_mghd@map))
df_plot <- data.frame(
PC1 = pca_res$x[, 1],
PC2 = pca_res$x[, 2],
Cluster = factor(mod_mghd@map)
)
ggplot(df_plot, aes(x = PC1, y = PC2, color = Cluster, fill = Cluster)) +
geom_point(alpha = 0.6, size = 2) +
stat_ellipse(geom = "polygon", alpha = 0.2, level = 0.95) +
scale_color_brewer(palette = "Set1") +
scale_fill_brewer(palette = "Set1") +
labs(
title = "Clustering MGHD",
subtitle = paste("Cluster:", num_cluster),
x = paste0("PC1 (", round(summary(pca_res)$importance[2,1]*100, 1), "%)"),
y = paste0("PC2 (", round(summary(pca_res)$importance[2,2]*100, 1), "%)")
) +
theme_minimal() +
theme(legend.position = "bottom")
profile <- df_clean %>%
group_by(cluster) %>%
summarise(
Num_Consumer = n(),
Avg_Wines = mean(mnt_wines),
Avg_Meat = mean(mnt_meat_products),
Avg_Gold = mean(mnt_gold_prods),
Avg_Income = exp(mean(income_log)),
Avg_Age = mean(age),
Avg_web = mean(num_web_visits_month),
Avg_purchases = mean(num_deals_purchases),
Avg_tot_spent=mean(total_spent)
) %>%
arrange(desc(Avg_tot_spent))
profile
## # A tibble: 4 × 10
## cluster Num_Consumer Avg_Wines Avg_Meat Avg_Gold Avg_Income Avg_Age Avg_web
## <fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 4 506 529. 410. 90.6 71500. 55.9 3.37
## 2 1 311 593. 322. 28.2 66510. 57.3 4.05
## 3 3 537 366. 83.6 55.1 49920. 57.7 6.24
## 4 2 857 30.0 19.8 14.8 30688. 52.4 6.36
## # ℹ 2 more variables: Avg_purchases <dbl>, Avg_tot_spent <dbl>
df_clean$Segmento <- factor(df_clean$cluster,
levels = c("4", "1", "3", "2"),
labels = c("Premium",
"High Potential",
"Value Seekers",
"Budget/Low Income"))
table(df_clean$Segmento)
##
## Premium High Potential Value Seekers Budget/Low Income
## 506 311 537 857
ggplot(df_clean, aes(x = Segmento, y = total_spent, fill = Segmento)) +
geom_boxplot(outlier.shape = NA, alpha = 0.7) +
scale_y_continuous(labels = scales::dollar_format()) +
# coord_cartesian taglia solo la visualizzazione, non i dati
coord_cartesian(ylim = c(0, 2500)) +
theme_minimal() +
labs(title = "Type of Consumer for Total Spent",
x = "Type of Consumer",
y = "Total Spent") +
theme(legend.position = "none")
ggplot(df_clean, aes(x = Segmento, y = income_log, fill = Segmento)) +
# Uso geom_boxplot invece di geom_density
geom_boxplot(outlier.shape = NA, alpha = 0.7) +
scale_y_continuous() +
theme_minimal() +
labs(title = "Type of Consumer for Total Spent",
x = "Type of Consumer",
y = "Income_log") +
theme(legend.position = "none")
For 50% of the data, the uncertainty is less than 0.8% (median value), while the average is much higher, so there are few observations that have a lot of uncertainty (asymmetric distribution).
n_obs <- length(mod_mghd@map)
n_clusters <- 4
z_matrix <- matrix(mod_mghd@z, nrow = n_obs, ncol = n_clusters)
uncertainty <- 1 - apply(z_matrix, 1, max)
summary(uncertainty)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000000 0.0008156 0.0084983 0.0730895 0.0831913 0.5775939
The categories overlap and are not clearly distinct.
df_plot_unc <- data.frame(
PC1 = pca_res$x[, 1],
PC2 = pca_res$x[, 2],
Cluster = factor(mod_mghd@map),
Uncertainty = uncertainty
)
df_plot_unc <- df_plot_unc[order(df_plot_unc$Uncertainty), ]
ggplot(df_plot_unc, aes(x = PC1, y = PC2, color = Uncertainty)) +
geom_point(size = 2, alpha = 0.8) +
scale_color_gradient(low = "blue", high = "red") +
theme_minimal() +
labs(
title = "Classification Uncertainty Map",
color = "Incertezza"
)
Those belonging to cluster 4 purchase most of the food basket, but are not the leading purchasers of wine. Those belonging to cluster 1 are the leading consumers of wine, and favor meat and fish, neglecting everything else. Those belonging to cluster 2 all have negative values and are the opposite class to Premium (cluster 4). Finally, those belonging to cluster 3 are below average for meat, fruit, and fish but above average for wine and gold products.
df_final <- as.data.frame(X)
df_final$Cluster <- factor(mod_mghd@map)
cluster_profile <- df_final %>%
group_by(Cluster) %>%
summarise(across(everything(), mean)) %>%
pivot_longer(-Cluster, names_to = "Variabile", values_to = "Media") %>%
pivot_wider(names_from = Cluster, values_from = Media, names_prefix = "Cluster_") %>%
mutate(across(where(is.numeric), ~ round(., 3)))
print(as.data.frame(cluster_profile))
## Variabile Cluster_1 Cluster_2 Cluster_3 Cluster_4
## 1 mnt_wines 0.853 -0.815 0.182 0.663
## 2 mnt_fruits 0.086 -0.506 -0.486 1.321
## 3 mnt_meat_products 0.689 -0.656 -0.372 1.083
## 4 mnt_fish_products 0.508 -0.537 -0.499 1.126
## 5 mnt_sweet_products 0.108 -0.513 -0.488 1.321
## 6 mnt_gold_prods -0.302 -0.562 0.218 0.906
profile_mghd <- df_clean%>%
group_by(Segmento)%>%
summarise(
Count = n(),
Avg_Wines = mean(mnt_wines),
Avg_Meat = mean(mnt_meat_products),
Avg_Gold = mean(mnt_gold_prods),
Avg_Fish = mean(mnt_fish_products),
Avg_Income = exp(mean(income_log)),
Avg_Age = mean(age),
Avg_web = mean(num_web_visits_month),
Avg_purchases = mean(num_deals_purchases),
Avg_tot_spent=mean(total_spent)
) %>%
arrange(desc(Avg_tot_spent))
print(profile_mghd)
## # A tibble: 4 × 11
## Segmento Count Avg_Wines Avg_Meat Avg_Gold Avg_Fish Avg_Income Avg_Age Avg_web
## <fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Premium 506 529. 410. 90.6 99.0 71500. 55.9 3.37
## 2 High Po… 311 593. 322. 28.2 65.3 66510. 57.3 4.05
## 3 Value S… 537 366. 83.6 55.1 10.3 49920. 57.7 6.24
## 4 Budget/… 857 30.0 19.8 14.8 8.23 30688. 52.4 6.36
## # ℹ 2 more variables: Avg_purchases <dbl>, Avg_tot_spent <dbl>
df_full <- as.data.frame(X)
df_full$Cluster <- factor(mod_mghd@map)
df_full$Uncertainty <- uncertainty
df_clean_cc <- df_full[df_full$Uncertainty < 0.2, ]
cat("Distribution of Cluster after filtering:\n")
## Distribution of Cluster after filtering:
print(table(df_clean_cc$Cluster))
##
## 1 2 3 4
## 229 788 456 431
cluster_means <- df_clean_cc %>%
group_by(Cluster) %>%
summarise(across(where(is.numeric), mean)) %>%
dplyr::select(-Uncertainty) %>%
pivot_longer(-Cluster, names_to = "Variabile", values_to = "Media") %>%
pivot_wider(names_from = Cluster, values_from = Media, names_prefix = "Cluster_") %>%
mutate(across(where(is.numeric), ~ round(., 3)))
print(as.data.frame(cluster_means))
## Variabile Cluster_1 Cluster_2 Cluster_3 Cluster_4
## 1 mnt_wines 0.938 -0.830 0.210 0.667
## 2 mnt_fruits 0.021 -0.530 -0.497 1.434
## 3 mnt_meat_products 0.658 -0.671 -0.378 1.098
## 4 mnt_fish_products 0.461 -0.553 -0.514 1.164
## 5 mnt_sweet_products 0.050 -0.528 -0.497 1.438
## 6 mnt_gold_prods -0.353 -0.590 0.286 1.002
The cluster that generates the most uncertainty is cluster 1, i.e., those who purchase the most wine.
check_uncertainty <- df_full %>%
group_by(Cluster) %>%
summarise(
Incertezza_Media = mean(Uncertainty),
Incertezza_Max = max(Uncertainty),
Casi_Critici = sum(Uncertainty > 0.2),
Totale_Casi = n()
) %>%
mutate(Perc_Critici = round(Casi_Critici / Totale_Casi * 100, 1))
print(check_uncertainty)
## # A tibble: 4 × 6
## Cluster Incertezza_Media Incertezza_Max Casi_Critici Totale_Casi Perc_Critici
## <fct> <dbl> <dbl> <int> <int> <dbl>
## 1 1 0.141 0.524 82 311 26.4
## 2 2 0.0468 0.506 69 857 8.1
## 3 3 0.0697 0.500 81 537 15.1
## 4 4 0.0796 0.578 75 506 14.8
ggplot(df_full, aes(x = Cluster, y = Uncertainty, fill = Cluster)) +
geom_boxplot(alpha = 0.7) +
geom_hline(yintercept = 0.2, linetype = "dashed", color = "red", size = 1) +
labs(
title = "Distribution of Uncertainty",
y = "Uncertainty (!-Probability)"
) +
theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
Cluster 1 tends to overlap with cluster 4. Wine consumers share with
premium consumers a high consumption of meat. However, in cluster 1, the
rest of the products are not consumed in large quantities, so when one
of these consumers buys a little more fruit (for example), they begin to
resemble a premium customer.
z_matrix <- matrix(mod_mghd@z, nrow = nrow(X), ncol = 4)
idx_incerti_c1 <- which(mod_mghd@map == 1 & (1 - apply(z_matrix, 1, max)) > 0.2)
second_best <- apply(z_matrix[idx_incerti_c1, ], 1, function(row) {
order(row, decreasing = TRUE)[2]
})
table(second_best)
## second_best
## 2 3 4
## 11 15 56
The graph confirms that there are heavy wine drinkers in both clusters. “Is it a Wine Lover who made a big purchase today? Or is it a Premium who bought less than usual?”
# La variabile other_expenses considera tutti gli acquisti tranne quelli del vino
df_comparison <- as.data.frame(X) %>%
mutate(
Cluster = factor(mod_mghd@map),
Uncertainty = uncertainty,
Other_Expenses = mnt_fruits + mnt_meat_products + mnt_fish_products + mnt_sweet_products + mnt_gold_prods
) %>%
filter(Cluster %in% c("1", "4"))
ggplot(df_comparison, aes(x = mnt_wines, y = Other_Expenses, color = Cluster)) +
geom_point(data = subset(df_comparison, Uncertainty < 0.2),
alpha = 0.4, size = 1.5) +
geom_point(data = subset(df_comparison, Uncertainty >= 0.2),
aes(shape = "Incerto"), size = 3, alpha = 0.8, stroke = 1.2) +
scale_color_manual(values = c("1" = "red", "4" = "blue")) +
labs(
title = "Wine Lovers (1) vs Premium (4)",
x = "Wine (std)",
y = "Other Expenses (std)",
shape = "Stato"
) +
theme_minimal()
#i punti più grandi sono quelli che il clustering confonde
The clusters have free shape, volume, and orientation. The number of degrees of freedom is the same for all clusters.
library(teigen)
set.seed(1926)
X_mat <- as.matrix(X)
#l'algoritmo sceglie liberamente il modello da utilizzare
mod_teigen <- teigen(X_mat, Gs = 2:6, verbose = FALSE)
(mod_teigen$modelname)
## [1] "UUUC"
(mod_teigen$G)
## [1] 6
summary(mod_teigen)
## ------------- Summary for teigen -------------
## ------ RESULTS ------
## Loglik: -3380.51
## BIC: -8054.821
## ICL: -8411.959
## Model: UUUC
## # Groups: 6
##
##
## Clustering Table:
##
## 1 2 3 4 5 6
## 251 262 691 423 227 357
if (is.null(mod_teigen$map)) {
cluster_teigen <- factor(apply(mod_teigen$fuzzy, 1, which.max))
} else {
cluster_teigen <- factor(mod_teigen$map)
}
print(length(cluster_teigen))
## [1] 2211
df_plot_teigen <- data.frame(
PC1 = pca_res$x[, 1],
PC2 = pca_res$x[, 2],
Cluster = cluster_teigen
)
ggplot(df_plot_teigen, aes(x = PC1, y = PC2, color = Cluster, fill = Cluster)) +
geom_point(alpha = 0.6, size = 2) +
stat_ellipse(geom = "polygon", alpha = 0.2, level = 0.95) +
scale_color_brewer(palette = "Set1") +
scale_fill_brewer(palette = "Set1") +
labs(
title = "Clustering Teigen (Modello: UUUC)",
subtitle = "T-test with constant heavy tails",
x = "PC1",
y = "PC2"
) +
theme_minimal() +
theme(legend.position = "bottom")
df_clean$Cluster_Teigen <- cluster_teigen
profile_teigen <- df_clean %>%
group_by(Cluster_Teigen) %>%
summarise(
Count = n(),
Avg_Wines = mean(mnt_wines),
Avg_Meat = mean(mnt_meat_products),
Avg_Gold = mean(mnt_gold_prods),
Avg_Fish = mean(mnt_fish_products),
Avg_Income = exp(mean(income_log)),
Avg_Age = mean(age),
Avg_web = mean(num_web_visits_month),
Avg_purchases = mean(num_deals_purchases),
Avg_tot_spent=mean(total_spent)
) %>%
arrange(desc(Avg_tot_spent))
print(profile_teigen)
## # A tibble: 6 × 11
## Cluster_Teigen Count Avg_Wines Avg_Meat Avg_Gold Avg_Fish Avg_Income Avg_Age
## <fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 3 691 546. 412. 74.1 96.3 70963. 56.4
## 2 6 357 548. 148. 77.4 25.5 57490. 57.2
## 3 5 227 264. 37.3 26.5 2.99 46711. 58.0
## 4 1 251 120. 62.7 23.7 13.9 40605. 55.2
## 5 4 423 12.3 13.5 12.0 6.85 25966. 49.7
## 6 2 262 25.4 8.13 4.11 1.05 34609. 55.7
## # ℹ 3 more variables: Avg_web <dbl>, Avg_purchases <dbl>, Avg_tot_spent <dbl>
Further discrimination is again made against heavy wine drinkers. The two new clusters concern consumers who only buy wine and those who essentially only buy wine and meat.
df_clean$Segmento_teg <- factor(df_clean$Cluster_Teigen,
levels = c("3","6","5" ,"1", "4", "2"),
labels = c("Premium",
"High Potential",
"Wine Lovers only",
"Wine and meat",
"Value Seekers",
"Budget/Low Income"))
table(df_clean$Segmento_teg)
##
## Premium High Potential Wine Lovers only Wine and meat
## 691 357 227 251
## Value Seekers Budget/Low Income
## 423 262
prob_mat <- mod_teigen$fuzzy
uncertainty_vec <- 1 - apply(prob_mat, 1, max)
df_clean$Uncertainty_teg <- uncertainty_vec
df_clean$Cluster_Teigen <- cluster_teigen
uncertainty_summary <- df_clean %>%
group_by(Cluster_Teigen) %>%
summarise(
Count = n(),
Min_Unc = min(Uncertainty_teg),
Mean_Unc = mean(Uncertainty_teg),
Median_Unc = median(Uncertainty_teg),
Max_Unc = max(Uncertainty_teg),
SD_Unc = sd(Uncertainty_teg)
) %>%
arrange(desc(Mean_Unc))
print(uncertainty_summary)
## # A tibble: 6 × 7
## Cluster_Teigen Count Min_Unc Mean_Unc Median_Unc Max_Unc SD_Unc
## <fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 251 0.000914 0.110 0.0460 0.565 0.134
## 2 6 357 0.000457 0.104 0.0343 0.517 0.140
## 3 5 227 0.0000692 0.0940 0.0230 0.518 0.136
## 4 4 423 0.000204 0.0679 0.0143 0.572 0.111
## 5 2 262 0.0000356 0.0509 0.00458 0.522 0.0986
## 6 3 691 0.0000000433 0.0324 0.0000783 0.573 0.0924
#vengono evidenziati solo i punti ad alta incertezza
threshold <- 0.2
uncertain_points <- df_plot_teigen[df_clean$Uncertainty > threshold, ]
## Warning: Unknown or uninitialised column: `Uncertainty`.
ggplot(df_plot_teigen, aes(x = PC1, y = PC2, color = Cluster)) +
geom_point(data = df_plot_teigen, color = "grey90", alpha = 0.5, size = 1) +
geom_point(data = uncertain_points, aes(color = Cluster), size = 2, alpha = 0.8) +
stat_ellipse(geom = "polygon", alpha = 0.1, level = 0.95) +
scale_color_brewer(palette = "Set1") +
labs(
title = "Uncertain Cases",
x = "PC1",
y = "PC2"
) +
theme_minimal()
With this model, there is less uncertainty in associating observations with clusters.
uncertainty <- 1 - apply(mod_teigen$fuzzy, 1, max)
df_plot_unc_teigen <- data.frame(
PC1 = pca_res$x[, 1],
PC2 = pca_res$x[, 2],
Cluster = factor(apply(mod_teigen$fuzzy, 1, which.max)),
Uncertainty = uncertainty
)
df_plot_unc_teigen <- df_plot_unc_teigen[order(df_plot_unc_teigen$Uncertainty), ]
ggplot(df_plot_unc_teigen, aes(x = PC1, y = PC2, color = Uncertainty)) +
geom_point(size = 2, alpha = 0.8) +
stat_ellipse(aes(group = Cluster), color = "gray40", alpha = 0.3, type = "norm") +
scale_color_gradient(low = "blue", high = "red") +
theme_minimal() +
labs(
title = "Uncertainty Classification",
subtitle = "Blue = Confident classification | Red = High uncertainty (Overlap zones)",
x = "PC1",
y = "PC2",
color = "Uncertainty"
)
Finally, we applied Box Cox transformations to the highly skewed variables in order to apply k-means (despite knowing that this would not yield great results, given the nature of the data). As expected, it does not discriminate the data well, identifying two large clusters.
vars_to_cluster <- grep("mnt_", names(df_clean), value = TRUE)
data_cluster <- df_clean[, vars_to_cluster]
#Trasformazione Box Cox
apply_boxcox <- function(x) {
if(min(x) <= 0) {
x <- x + 1
}
# Calcola il lambda ottimale
bc_obj <- BoxCoxTrans(x)
# Applica la trasformazione
x_trans <- predict(bc_obj, x)
return(x_trans)
}
data_transformed <- data_cluster %>%
mutate(across(everything(), apply_boxcox))
data_scaled <- scale(data_transformed)
fviz_nbclust(data_scaled, kmeans, method = "silhouette") +
ggtitle("Optimal K")
set.seed(1926)
k_final <- 2
km_res <- kmeans(data_scaled, centers = k_final, nstart = 25)
df_clean$Cluster_Box <- as.factor(km_res$cluster)
#visualizzazione PCA
fviz_cluster(km_res, data = data_scaled,
palette = "jco",
ggtheme = theme_minimal(),
main = "Clustering after Box-Cox Transformation")
df_clean %>%
group_by(Cluster_Box) %>%
summarise(across(all_of(vars_to_cluster), mean)) %>%
print()
## # A tibble: 2 × 7
## Cluster_Box mnt_wines mnt_fruits mnt_meat_products mnt_fish_products
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 1 94.5 3.93 29.9 5.42
## 2 2 524. 49.6 310. 70.9
## # ℹ 2 more variables: mnt_sweet_products <dbl>, mnt_gold_prods <dbl>
The model that uses generalized hyperbolic distribution tends to perform better in terms of ICL and BIC. BOX-COX transformations were tested purely out of curiosity. Given the nature of the data, outliers were cut, determining them using the interquartile range, but the results did not show any significant improvement. In particular, for the model that uses generalized hyperbolic distributions, the evaluation metrics improved by a few points, while the Teigen model lost much of its power and greatly increased its uncertainty in the allocation of observations.